package org.biogroovy.util

/**
 * This class contains a list of the accession types and the regular expressions
 * for them.
 *
 * @author markfortner
 *
 * @see <a href="http://www.ncbi.nlm.nih.gov/RefSeq/key.html">NCBI RefSeq accession key</a>
 */
class AccessionType {

    // Each accession type is exposed as a pair of constants: a symbolic name
    // and a REGEX_* pattern string for the corresponding accession prefix.
    // The patterns are not anchored, so use full-match semantics (for example
    // String.matches) when validating a complete accession string.

    /**
     * Alternate complete genomic molecule. This prefix is used for records that
     * are provided to reflect an alternate assembly or annotation. Primarily used
     * for viral, prokaryotic records.
     */
    public static final String ALT_COMPLEX_GENOMIC = "ALT_COMPLEX_GENOMIC";
    /** Pattern: {@code AC_} followed by six digits. */
    public static final String REGEX_ALT_COMPLEX_GENOMIC = "AC_\\d{6}";

    /**
     * Protein products; alternate protein record. This prefix is used for records
     * that are provided to reflect an alternate assembly or annotation. The AP_ prefix
     * was originally designated for bacterial proteins but this usage was changed.
     */
    public static final String ALT_PROTEIN_RECORD = "ALT_PROTEIN_RECORD";
    /** Pattern: {@code AP_} followed by six digits. */
    public static final String REGEX_ALT_PROTEIN_RECORD = "AP_\\d{6}";

    /**
     * Complete genomic molecules including genomes, chromosomes, organelles, plasmids.
     */
    public static final String COMPLETE_GENOMIC = "COMPLETE_GENOMIC";
    /** Pattern: {@code NC_} followed by six digits. */
    public static final String REGEX_COMPLETE_GENOMIC = "NC_\\d{6}";

    /**
     * Incomplete genomic region; supplied to support the NCBI genome annotation
     * pipeline. Represents either non-transcribed pseudogenes, or larger regions
     * representing a gene cluster that is difficult to annotate via automatic methods.
     */
    public static final String INCOMPLETE_GENOMIC = "INCOMPLETE_GENOMIC";
    /** Pattern: {@code NG_} followed by six digits. */
    public static final String REGEX_INCOMPLETE_GENOMIC = "NG_\\d{6}";

    /**
     * Transcript products; mature messenger RNA (mRNA) transcripts.
     */
    public static final String TRANSCRIPT_PROD = "TRANSCRIPT_PROD";
    /** Pattern: {@code NM_} followed by six or nine digits. */
    public static final String REGEX_TRANSCRIPT_PROD = "NM_\\d{6}|NM_\\d{9}";

    /**
     * Protein products; primarily full-length precursor products but may include
     * some partial proteins and mature peptide products.
     */
    public static final String PROTEIN_PROD2 = "PROTEIN_PROD2";
    /**
     * Pattern: {@code NP_} followed by six or nine digits.
     * A single {@code |} separates the alternatives; a doubled one would add an
     * empty alternative and make the pattern match the empty string.
     */
    public static final String REGEX_PROTEIN_PROD2 = "NP_\\d{6}|NP_\\d{9}";

    /**
     * Non-coding transcripts including structural RNAs, transcribed pseudogenes, and others.
     */
    public static final String NON_CODE_TRANS = "NON_CODE_TRANS";
    /** Pattern: {@code NR_} followed by six digits. */
    public static final String REGEX_NON_CODE_TRANS = "NR_\\d{6}";

    /**
     * Intermediate genomic assemblies of BAC and/or Whole Genome Shotgun sequence data.
     */
    public static final String INT_GENOMIC_ASSEMBLY = "INT_GENOMIC_ASSEMBLY";
    /** Pattern: {@code NT_} followed by six digits. */
    public static final String REGEX_INT_GENOMIC_ASSEMBLY = "NT_\\d{6}";

    /**
     * Intermediate genomic assemblies of BAC or Whole Genome Shotgun sequence data.
     */
    public static final String INT_GENOMIC_ASSEMBLY2 = "INT_GENOMIC_ASSEMBLY2";
    /** Pattern: {@code NW_} followed by six or nine digits. */
    public static final String REGEX_INT_GENOMIC_ASSEMBLY2 = "NW_\\d{6}|NW_\\d{9}";

    /**
     * A collection of whole genome shotgun sequence data for a project. Accessions
     * are not tracked between releases. The first four characters following the
     * underscore (e.g. 'ABCD') identifies a genome project.
     */
    public static final String WHOLE_GENOME_SEQ = "WHOLE_GENOME_SEQ";
    /**
     * Pattern: {@code NZ_}, four ASCII letters, then eight digits.
     * The POSIX class name is case-sensitive in java.util.regex, so it must be
     * spelled {@code Alpha}.
     */
    public static final String REGEX_WHOLE_GENOME_SEQ = "NZ_\\p{Alpha}{4}\\d{8}";

    /**
     * Transcript products; model mRNA provided by a genome annotation process;
     * sequence corresponds to the genomic contig.
     */
    public static final String TRANSCRIPT_PROD2 = "TRANSCRIPT_PROD2";
    /** Pattern: {@code XM_} followed by six or nine digits. */
    public static final String REGEX_TRANSCRIPT_PROD2 = "XM_\\d{6}|XM_\\d{9}";

    /**
     * Protein products; model proteins provided by a genome annotation process;
     * sequence corresponds to the genomic contig.
     */
    public static final String PROTEIN_PROD3 = "PROTEIN_PROD3";
    /** Pattern: {@code XP_} followed by six or nine digits. */
    public static final String REGEX_PROTEIN_PROD3 = "XP_\\d{6}|XP_\\d{9}";

    /**
     * Transcript products; model non-coding transcripts provided by a genome
     * annotation process; sequence corresponds to the genomic contig.
     */
    public static final String TRANSCRIPT_PROD3 = "TRANSCRIPT_PROD3";
    /** Pattern: {@code XR_} followed by six digits. */
    public static final String REGEX_TRANSCRIPT_PROD3 = "XR_\\d{6}";

    /**
     * Protein products; no corresponding transcript record provided. Primarily
     * used for bacterial, viral, and mitochondrial records.
     */
    public static final String PROTEIN_PROD4 = "PROTEIN_PROD4";
    /** Pattern: {@code YP_} followed by six or nine digits. */
    public static final String REGEX_PROTEIN_PROD4 = "YP_\\d{6}|YP_\\d{9}";

    /**
     * Protein products; annotated on NZ_ accessions (often via computational methods).
     */
    public static final String PROTEIN_PROD5 = "PROTEIN_PROD5";
    /** Pattern: {@code ZP_} followed by eight digits. */
    public static final String REGEX_PROTEIN_PROD5 = "ZP_\\d{8}";

    /**
     * Genomic records that represent an assembly which does not reflect the
     * structure of a real biological molecule. The assembly may represent an
     * unordered assembly of unplaced scaffolds, or it may represent an assembly
     * of DNA sequences generated from a biological sample that may not represent
     * a single organism.
     */
    public static final String GENOMIC_REC = "GENOMIC_REC";
    /** Pattern: {@code NS_} followed by six digits. */
    public static final String REGEX_GENOMIC_REC = "NS_\\d{6}";

}

